{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#不用print即可输出多个变量\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = \"all\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 导入库\n",
    "**paddle.fluid**--->PaddlePaddle深度学习框架  \n",
    "**numpy**---------->python基本库，用于科学计算  \n",
    "**os**------------->python的模块，可使用该模块对操作系统进行操作  \n",
    "**matplotlib**----->python绘图库，可方便绘制折线图、散点图等图形  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import paddle.fluid as fluid\n",
    "import paddle\n",
    "import numpy as np\n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据导入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "task_path='data/enterprise_profit_forecast'\n",
    "train=pd.read_csv('{}/output/train.csv'.format(task_path))\n",
    "test=pd.read_csv('{}/output/test.csv'.format(task_path))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "train_reader和test_reader\n",
    "\n",
    "paddle.reader.shuffle()表示每次缓存BUF_SIZE个数据项，并进行打乱\n",
    "\n",
    "paddle.batch()表示每BATCH_SIZE组成一个batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 8.000000e+00,  1.190000e+03,  3.000000e+00,  5.000000e+00,\n",
       "        0.000000e+00,  1.000000e+00,  9.100000e-01,  1.000000e+00,\n",
       "        0.000000e+00,  0.000000e+00,  3.332000e+03,  2.665600e+02,\n",
       "        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,\n",
       "        0.000000e+00,  0.000000e+00,  4.760000e+03,  3.808000e+02,\n",
       "        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,\n",
       "        0.000000e+00,  0.000000e+00,  7.400000e+01,  3.332000e+04,\n",
       "        6.426000e+04,  1.466080e+05,  1.026256e+05,  7.330400e+04,\n",
       "       -1.466080e+04,  0.000000e+00, -3.094000e+04,  6.310000e+02,\n",
       "        4.760000e+04,  6.961500e+04,  5.236000e+04,  2.094400e+04,\n",
       "        1.570800e+04, -5.236000e+03,  0.000000e+00, -2.201500e+04,\n",
       "        8.353800e+03])"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for sam in train.values[:,1:]:\n",
    "    sam\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "def reader(df):\n",
    "    data=df.values[:,1:]\n",
    "    for sample in data:\n",
    "        yield sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "def splitData(df):\n",
    "    train=df.sample(frac=0.8)\n",
    "    test=df[~df.index.isin(train.index)]\n",
    "    return train.values[:,1:],test.values[:,1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data,test_data=splitData(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(623475, 1)"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data=train_data.reshape(-1,1)\n",
    "test_data=test_data.reshape(-1,1)\n",
    "train_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "def reader(data):\n",
    "    for sample in data:\n",
    "        yield sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "BUF_SIZE=500\n",
    "BATCH_SIZE=20\n",
    "\n",
    "#用于训练的数据提供器，每次从缓存中随机读取批次大小的数据\n",
    "train_reader=paddle.batch(fluid.io.shuffle(reader(train_data),buf_size=BUF_SIZE),batch_size=BATCH_SIZE)\n",
    "test_reader=paddle.batch(fluid.io.shuffle(reader(test_data),buf_size=BUF_SIZE),batch_size=BATCH_SIZE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(17319, 46)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "(4068, 45)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.shape\n",
    "test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 网络配置"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 网络搭建\n",
    "对于线性回归来讲，它就是一个从输入到输出的简单的全连接层"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![https://aistudio.baidu.com/aistudio/projectdetail/78917](https://ai-studio-static-online.cdn.bcebos.com/f3f567e7aa7a499fb3abb767aaaa1b943eed373cf4694d87beb382e161ea8edc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![https://aistudio.baidu.com/aistudio/projectdetail/78917](https://ai-studio-static-online.cdn.bcebos.com/f6c44a96e1624828829bbd438c29c17ead9ecc45c68b4310bfbbb0a4dc96c3fe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "#定义张量变量x，表示44维的特征值\n",
    "x = fluid.layers.data(name='x', shape=[44], dtype='float32')\n",
    "#定义张量y，表示目标值\n",
    "y = fluid.layers.data(name='y', shape=[1], dtype='float32')\n",
    "#定义一个简单的线性网络,连接输入和输出的全连接层\n",
    "#input:输入tensor;\n",
    "#size:该层输出单元的数目\n",
    "#act:激活函数\n",
    "y_predict=fluid.layers.fc(input=x,size=1,act=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 定义损失函数\n",
    "此处使用均方差损失函数。\n",
    "\n",
    "square_error_cost(input,lable):接受输入预测值和目标值，并返回方差估计,即为（y-y_predict）的平方"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "cost = fluid.layers.square_error_cost(input=y_predict, label=y) #求一个batch的损失值\n",
    "avg_cost = fluid.layers.mean(cost)                              #对损失值求平均值"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 定义优化函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.001)\n",
    "opts = optimizer.minimize(avg_cost)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_program = fluid.default_main_program().clone(for_test=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "在上述模型配置完毕后，得到两个fluid.Program：fluid.default_startup_program() 与fluid.default_main_program() 配置完毕了。\n",
    "\n",
    "参数初始化操作会被写入**fluid.default_startup_program()**\n",
    "\n",
    "**fluid.default_main_program()**用于获取默认或全局main program(主程序)。该主程序用于训练和测试模型。fluid.layers 中的所有layer函数可以向 default_main_program 中添加算子和变量。default_main_program 是fluid的许多编程接口（API）的Program参数的缺省值。例如,当用户program没有传入的时候， Executor.run() 会默认执行 default_main_program 。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型训练与评估"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 创建Executor\n",
    "首先定义运算场所 fluid.CPUPlace()和 fluid.CUDAPlace(0)分别表示运算场所为CPU和GPU\n",
    "\n",
    "Executor:接收传入的program，通过run()方法运行program。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "use_cuda = True                         #use_cuda为False,表示运算场所为CPU;use_cuda为True,表示运算场所为GPU           \n",
    "place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()\n",
    "exe = fluid.Executor(place)              #创建一个Executor实例exe\n",
    "exe.run(fluid.default_startup_program()) #Executor的run()方法执行startup_program(),进行参数初始化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 定义输入数据维度\n",
    "DataFeeder负责将数据提供器（train_reader,test_reader）返回的数据转成一种特殊的数据结构，使其可以输入到Executor中。\n",
    "\n",
    "feed_list设置向模型输入的向变量表或者变量表名"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义输入数据维度\n",
    "feeder = fluid.DataFeeder(place=place, feed_list=[x, y])#feed_list:向模型输入的变量表或变量表名"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 定义绘制训练过程的损失值变化趋势的方法draw_train_process"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "iter=0;\n",
    "iters=[]\n",
    "train_costs=[]\n",
    "\n",
    "def draw_train_process(iters,train_costs):\n",
    "    title=\"training cost\"\n",
    "    plt.title(title, fontsize=24)\n",
    "    plt.xlabel(\"iter\", fontsize=14)\n",
    "    plt.ylabel(\"cost\", fontsize=14)\n",
    "    plt.plot(iters, train_costs,color='red',label='training cost') \n",
    "    plt.grid()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 训练并保存模型\n",
    "Executor接收传入的program,并根据feed map(输入映射表)和fetch_list(结果获取表) 向program中添加feed operators(数据输入算子)和fetch operators（结果获取算子)。 feed map为该program提供输入数据。fetch_list提供program训练结束后用户预期的变量。\n",
    "\n",
    "注：enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列，同时列出数据和数据下标"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "'generator' object is not callable",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-59-5f919dbd5b7a>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      5\u001b[0m     \u001b[1;31m# 开始训练并输出最后一个batch的损失值\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m     \u001b[0mtrain_cost\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m     \u001b[1;32mfor\u001b[0m \u001b[0mbatch_id\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m \u001b[1;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_reader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m              \u001b[1;31m#遍历train_reader迭代器\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      8\u001b[0m         train_cost = exe.run(program=fluid.default_main_program(),#运行主程序\n\u001b[0;32m      9\u001b[0m                              \u001b[0mfeed\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfeeder\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfeed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m              \u001b[1;31m#喂入一个batch的训练数据，根据feed_list和data提供的信息，将输入数据转成一种特殊的数据结构\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\envs\\paddle\\lib\\site-packages\\paddle\\batch.py\u001b[0m in \u001b[0;36mbatch_reader\u001b[1;34m()\u001b[0m\n\u001b[0;32m     56\u001b[0m         \u001b[0mr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mreader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     57\u001b[0m         \u001b[0mb\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 58\u001b[1;33m         \u001b[1;32mfor\u001b[0m \u001b[0minstance\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mr\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     59\u001b[0m             \u001b[0mb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minstance\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     60\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\envs\\paddle\\lib\\site-packages\\paddle\\reader\\decorator.py\u001b[0m in \u001b[0;36mdata_reader\u001b[1;34m()\u001b[0m\n\u001b[0;32m    133\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mdata_reader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    134\u001b[0m         \u001b[0mbuf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 135\u001b[1;33m         \u001b[1;32mfor\u001b[0m \u001b[0me\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mreader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    136\u001b[0m             \u001b[0mbuf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    137\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbuf\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>=\u001b[0m \u001b[0mbuf_size\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mTypeError\u001b[0m: 'generator' object is not callable"
     ]
    }
   ],
   "source": [
    "EPOCH_NUM=50\n",
    "model_save_dir = \"data/enterprise_profit_forecast/model/fit_a_line.inference.model\"\n",
    "\n",
    "for pass_id in range(EPOCH_NUM):                                  #训练EPOCH_NUM轮\n",
    "    # 开始训练并输出最后一个batch的损失值\n",
    "    train_cost = 0\n",
    "    for batch_id, data in enumerate(train_reader()):              #遍历train_reader迭代器\n",
    "        train_cost = exe.run(program=fluid.default_main_program(),#运行主程序\n",
    "                             feed=feeder.feed(data),              #喂入一个batch的训练数据，根据feed_list和data提供的信息，将输入数据转成一种特殊的数据结构\n",
    "                             fetch_list=[avg_cost])    \n",
    "        if batch_id % 40 == 0:\n",
    "            print(\"Pass:%d, Cost:%0.5f\" % (pass_id, train_cost[0][0]))    #打印最后一个batch的损失值\n",
    "        iter=iter+BATCH_SIZE\n",
    "        iters.append(iter)\n",
    "        train_costs.append(train_cost[0][0])\n",
    "       \n",
    "   \n",
    "    # 开始测试并输出最后一个batch的损失值\n",
    "    test_cost = 0\n",
    "    for batch_id, data in enumerate(test_reader()):               #遍历test_reader迭代器\n",
    "        test_cost= exe.run(program=test_program, #运行测试cheng\n",
    "                            feed=feeder.feed(data),               #喂入一个batch的测试数据\n",
    "                            fetch_list=[avg_cost])                #fetch均方误差\n",
    "    print('Test:%d, Cost:%0.5f' % (pass_id, test_cost[0][0]))     #打印最后一个batch的损失值\n",
    "    \n",
    "    #保存模型\n",
    "    # 如果保存路径不存在就创建\n",
    "if not os.path.exists(model_save_dir):\n",
    "    os.makedirs(model_save_dir)\n",
    "print ('save models to %s' % (model_save_dir))\n",
    "#保存训练参数到指定路径中，构建一个专门用预测的program\n",
    "fluid.io.save_inference_model(model_save_dir,   #保存推理model的路径\n",
    "                                  ['x'],            #推理（inference）需要 feed 的数据\n",
    "                                  [y_predict],      #保存推理（inference）结果的 Variables\n",
    "                                  exe)              #exe 保存 inference model\n",
    "draw_train_process(iters,train_costs)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:paddle]",
   "language": "python",
   "name": "conda-env-paddle-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
